/*
Input: tmp/strassen_prep_ltw18_single [list of all addresses]

Output: tmp/voter_addresses_unique_geocoded.dta [addresses with added geocodes]

Main task:
	> merge geocodes from preprocessed geocoding (default)

	OR (in case you run the geocoding yourself)

	> prepare addresses for geocoding
	> geocoding
	> cleaning

Note: 
	> We use the module 'opencagegeo' to geo-reference addresses. The program uses OpenStreetMap (OSM).
	
		>> Installation:
		* Install the Stata module and two required user-written stata libraries from SSC:
			. ssc install opencagegeo
			. ssc install libjson
			. ssc install insheetjson
		>> Batch geocode addresses (forward geocoding):
		opencagegeo , key(XXXXXX) countrycode(de) language(de) ///
				city(stadt) street(strasse) number(nummer)
				
		> Registration for API key is required
		> basic key allows for only 2,500 addresses per day and 1 request/sec 
			(we purchased a temporary key allowing for more addresses per day and also 25-40 requests/sec )
		> 'opencagegeo' needs the following inputs (in addition to your API key): 
			countrycode(de): as Munich is in Germany
			language(de): street names are in German
			city(stadt): variable for the city (was generated by us)
			street(strasse): street name, coming from electoral roll
			number(nummer): address number, potentially containing letters (6, 6a, 6b, etc.)
			
		>> Resulting output:
			> generated variables have prefix "g_"
				g_quality gives how detailed the address was found: city-, postcode-, street-, number-level (number most exact; g_quality==7)
				g_lon: longitude of address
				g_lat: latitude of address
			> addresses not geocoded at number-level are inspected manually. Often an address exists but has not been added to OSM
			> Haimhauserstrasse 13 was at a wrong location (corrected manually)
			> addresses geolocated outside of Munich were also inspected manually and dropped if not found in Munich
			
	If you want to re-run the geocoding yourself, set the global RERUN_GEOCODE_VOTER to 1 
	and set your local API key in the _master.do.
	
	Otherwise, this dofile simply merges preprocessed geo-coordinates located in
		03_geocoded_addresses/ voter_addresses_ltw18_geocoded.dta
	to the list of unique voter addresses.

*/

 
* PULL addresses from 2018
* Purpose of this script body: attach geo-coordinates (g_lat, g_lon) to the list of
* unique voter addresses and save the result as
* $tmp/voter_addresses_unique_geocoded.dta.
*   mode 0: merge coordinates from the preprocessed file in $geocod
*   mode 1: geocode the addresses with opencagegeo, then clean the result
	use "$tmp/strassen_prep_ltw18_single", clear 

* RESOLVE run mode
	// An unset global would expand to "if ==0 {" and stop with a cryptic syntax
	// error, so fall back to mode 0 (merge preprocessed geocodes) in that case.
	local rerun_mode "$RERUN_GEOCODE_VOTER"
	if trim("`rerun_mode'") == "" local rerun_mode 0

if `rerun_mode' == 0 {
 	
	//variables only needed for geocoding
	drop strasse stadt
	
	// MERGE geocodes to voter addresses 
	// assert(1 3): stop if the geocoded file contains an address that is not in the
	// address list; keep(3): keep only addresses that have a geocode.
	merge 1:1 stadtbez merge_strasse nummer using "$geocod/voter_addresses_ltw18_geocoded.dta", assert(1 3) keep(3) nogen //some addresses not possible to geocode
	
	lab var g_lat "latitude of voter address"
	lab var g_lon "longitude of voter address"
	
	// save
	save "$tmp/voter_addresses_unique_geocoded.dta", replace
}
else if `rerun_mode' == 1 {

	// FAIL EARLY with a readable message if no API key was set in _master.do
	if trim("$api_key") == "" {
		display as error "global api_key is empty: set your OpenCage API key in _master.do before re-running the geocoding"
		exit 198
	}

** 1. Prep 
	// clean (some addresses contain a city)
	// The text after "(" in strasse is taken as the city, e.g. "...(Pul)" -> "Pul)".
	// NOTE(review): strasse itself is left unchanged (strasse1 is dropped without
	// being written back), so the "(...)" suffix is still part of the street name
	// passed to the geocoder below -- confirm this is intended.
	split strasse, p("(")
	replace stadt = strasse2 if missing(strasse2)==0
	drop strasse1 strasse2
	// strip the trailing ")" left over from the split
	replace stadt = substr(stadt,1,strlen(stadt)-1) if substr(stadt,strlen(stadt),1)==")"
	// expand the abbreviated city name
	replace stadt="Pullach" if stadt=="Pul"

	sort stadtbez strasse nummer

	// KEEP only necessary variables (for geocoding or merging afterwards)
	keep  stadt stadtbez merge_strasse strasse nummer
	order stadt stadtbez merge_strasse strasse nummer

** 2. Run geocoding using OPENCAGE
	opencagegeo , key($api_key) countrycode(de) language(de) ///
				city(stadt) street(strasse) number(nummer) 

	lab var g_lat "latitude of voter address"
	lab var g_lon "longitude of voter address"

	// save raw results 
	save "$tmp/voter_ltw18_einzel_geo_raw", replace

** 3. Assess quality of result and manually correct errors
	// manually correct coordinates 
	// Coordinates are assigned as string literals (g_lat/g_lon are treated as
	// string variables here); g_quality is set to 7 so that the corrected rows
	// pass the "drop if g_quality<7" filter below.
	// -->  g_quality==4 (city)
	replace g_lat = "48.0663858" if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="1"
	replace g_lon = "11.5172119" if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="1"
	replace g_quality = 7 		 if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="1"
	replace g_lat = "48.0662568" if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="3"
	replace g_lon = "11.5172146" if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="3"
	replace g_quality = 7 		 if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="3"
	replace g_lat = "48.0660954" if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="5"
	replace g_lon = "11.5171395" if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="5"
	replace g_quality = 7 		 if merge_strasse=="wolfratshauserstrasse(pul)" & nummer=="5"
	replace g_lat = "48.1848956" if merge_strasse=="weimarerstrasse" & nummer=="30"
	replace g_lon = "11.5900233" if merge_strasse=="weimarerstrasse" & nummer=="30"
	replace g_quality = 7 		 if merge_strasse=="weimarerstrasse" & nummer=="30"

	// ---> g_quality==5 (postcode)
	replace g_lat = "48.1369886" if merge_strasse=="hofstatt" & nummer=="4"
	replace g_lon = "11.5712595" if merge_strasse=="hofstatt" & nummer=="4"
	replace g_quality = 7 		 if merge_strasse=="hofstatt" & nummer=="4"
	replace g_lat = "48.1347055" if merge_strasse=="ammoosfeld" & nummer=="15b"
	replace g_lon = "11.6585821" if merge_strasse=="ammoosfeld" & nummer=="15b"
	replace g_quality = 7 		 if merge_strasse=="ammoosfeld" & nummer=="15b"

	// check quality 
	tab g_quality
	// DROP addresses that could not be geocoded
	// (everything below number-level that was not corrected manually above)
	drop if g_quality<7
	
	// KEEP only addresses in Munich
	// NOTE(review): the manual corrections above do not touch g_city, so corrected
	// rows survive only if the geocoder already returned "München" for them.
	keep if g_city=="München"

	// CORRECT mis-geolocated address (wrong location in OSM)
	replace g_lat = "48.1623233" if merge_strasse=="haimhauserstrasse" & nummer=="13"
	replace g_lon = "11.5901594" if merge_strasse=="haimhauserstrasse" & nummer=="13"

	// ASSERT uniqueness
	// (same key that the merge in the default branch relies on)
	isid stadtbez merge_strasse nummer
	
	// KEEP only relevant variables
	keep stadtbez merge_strasse nummer g_lat g_lon

	// save: CLEANED geocoding
	save "$tmp/voter_addresses_unique_geocoded.dta", replace
	
}
else {
	// Any other value used to skip both branches silently, leaving no output file.
	display as error "RERUN_GEOCODE_VOTER must be 0 (merge preprocessed geocodes) or 1 (re-run geocoding); got: `rerun_mode'"
	exit 198
}
